Take a ZIP file of images and process them, using a library built into Python that you need to learn how to use. A ZIP file takes several different files and compresses them into one single file, thus saving space. The files in the ZIP file we provide are newspaper images (like you saw in week 3). Your task is to write Python code which allows one to search through the images looking for the occurrences of keywords and faces. E.g. if you search for "pizza" it will return a contact sheet of all of the faces which were located on the newspaper page which mentions "pizza". This will test your ability to learn a new library, your ability to use OpenCV to detect faces, your ability to use Tesseract to do optical character recognition, and your ability to use PIL to composite images together into contact sheets.
Each page of the newspapers is saved as a single PNG image in a file called images.zip. These newspapers are in English, and contain a variety of stories, advertisements and images. Note: This file is fairly large (~200 MB) and may take some time to work with, so I would encourage you to use small_img.zip for testing.
Here's an example of the output expected. Using the small_img.zip file, if I search for the string "Christopher" I should see the following image:
If I were to use the images.zip file and search for "Mark" I should see the following image (note that there are times when there are no faces on a page, but a word is found!):

Note: That big file can take some time to process - for me it took nearly ten minutes! Use the small one for testing.
import zipfile
from PIL import Image
import pytesseract
import cv2 as cv
import numpy as np
from IPython.display import display
# Load the frontal-face Haar cascade definition into an OpenCV classifier.
# extract_faces() calls haar_cascade.detectMultiScale() on each matching page.
haar_cascade = cv.CascadeClassifier('readonly/haarcascade_frontalface_default.xml')
# the rest is up to you!
def get_grayscale(image):
    """Return a grayscale version of ``image``.

    The conversion uses OpenCV's ``COLOR_BGR2GRAY`` code, so ``image`` is
    expected to be in BGR channel order.
    """
    grey = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    return grey
def find_word_in_image(page, img, word):
    """OCR a page and, if ``word`` occurs on it, show the page's faces.

    Args:
        page: Name of the page, used only in the printed messages.
        img: Page image in BGR channel order.
        word: Text to look for; matched as a substring of each OCR token.

    The first matching token prints a message, hands the page to
    ``extract_faces`` and stops the scan, so a page is reported at most once.
    """
    ocr_table = pytesseract.image_to_data(get_grayscale(img))

    # The first line of the TSV output is the column header, so skip it.
    for row in ocr_table.splitlines()[1:]:
        columns = row.split('\t')
        # The recognised text sits in column 11; ignore rows that lack it.
        if len(columns) <= 11:
            continue
        if word not in columns[11]:
            continue
        print(f"\"{word}\" found in {page}")
        extract_faces(page, img)
        return
def extract_faces(page, img, *, scale_factor=1.3, min_neighbors=3,
                  thumb_size=350, columns=5):
    """Detect faces on a page and display them as a contact sheet.

    Args:
        page: Name of the page, used only in the printed messages.
        img: Page image in BGR channel order (the colour conversions below
            assume this).
        scale_factor: ``scaleFactor`` passed to ``detectMultiScale``.
        min_neighbors: ``minNeighbors`` passed to ``detectMultiScale``.
        thumb_size: Edge length, in pixels, of each square face thumbnail.
        columns: Number of thumbnails per contact-sheet row.

    Raises:
        ValueError: If ``thumb_size`` or ``columns`` is smaller than 1.

    The defaults reproduce the previously hard-coded behaviour. Nothing is
    returned: the sheet is shown with ``display`` and a summary is printed.
    """
    if thumb_size < 1 or columns < 1:
        raise ValueError("thumb_size and columns must both be >= 1")

    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    faces_rect = haar_cascade.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors)

    # Use len() rather than truthiness: the detector result may be a NumPy
    # array, whose truth value is ambiguous.
    face_count = len(faces_rect)
    if face_count == 0:
        print(f"But no faces found in {page}")
        return

    print(f'{face_count} faces found in {page}')

    # Ceiling division: a partially filled last row still needs a full row.
    rows = -(-face_count // columns)
    sheet = np.zeros((thumb_size * rows, thumb_size * columns, 3),
                     dtype=np.uint8)

    for index, (x, y, w, h) in enumerate(faces_rect):
        # Fill the grid left-to-right, top-to-bottom.
        row, col = divmod(index, columns)
        top, left = row * thumb_size, col * thumb_size
        face = img[y:y + h, x:x + w]
        sheet[top:top + thumb_size, left:left + thumb_size] = cv.resize(
            face, (thumb_size, thumb_size), interpolation=cv.INTER_CUBIC)

    # PIL expects RGB, while the sheet was assembled from BGR pixels.
    display(Image.fromarray(cv.cvtColor(sheet, cv.COLOR_BGR2RGB)))
# Ask for the search term. Only failures of input() itself are handled:
# EOFError (stdin closed) and RuntimeError, which also covers the
# NotImplementedError subclasses raised where stdin is unavailable.
# A bare `except:` would also swallow KeyboardInterrupt and SystemExit.
try:
    word = input("Enter word to search : ")
except (EOFError, RuntimeError):
    print("Input error, try again")
    word = None  # without a search term there is nothing to process

if word is not None:
    with zipfile.ZipFile('readonly/small_img.zip', 'r') as news_zip:
        news_zip.extractall('news_papers')
        for im in news_zip.namelist():
            img = cv.imread(f'news_papers/{im}')
            # cv.imread returns None for entries it cannot decode, such as
            # directories or non-image files; skip them instead of crashing
            # inside the grayscale conversion.
            if img is None:
                print(f'Skipping {im}: not a readable image')
                continue
            print(f'Processing {im}')
            find_word_in_image(im, img, word)
import zipfile
from PIL import Image
import pytesseract
import cv2 as cv
import numpy as np
from IPython.display import display
# Load the frontal-face Haar cascade definition into an OpenCV classifier.
# extract_faces() calls haar_cascade.detectMultiScale() on each matching page.
haar_cascade = cv.CascadeClassifier('readonly/haarcascade_frontalface_default.xml')
# the rest is up to you!
def get_grayscale(image):
    """Return a grayscale version of ``image``.

    The conversion uses OpenCV's ``COLOR_BGR2GRAY`` code, so ``image`` is
    expected to be in BGR channel order.
    """
    grey = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    return grey
def find_word_in_image(page, img, word):
    """OCR a page and, if ``word`` occurs on it, show the page's faces.

    Args:
        page: Name of the page, used only in the printed messages.
        img: Page image in BGR channel order.
        word: Text to look for; matched as a substring of each OCR token.

    The first matching token prints a message, hands the page to
    ``extract_faces`` and stops the scan, so a page is reported at most once.
    """
    ocr_table = pytesseract.image_to_data(get_grayscale(img))

    # The first line of the TSV output is the column header, so skip it.
    for row in ocr_table.splitlines()[1:]:
        columns = row.split('\t')
        # The recognised text sits in column 11; ignore rows that lack it.
        if len(columns) <= 11:
            continue
        if word not in columns[11]:
            continue
        print(f"\"{word}\" found in {page}")
        extract_faces(page, img)
        return
def extract_faces(page, img, *, scale_factor=1.5, min_neighbors=3,
                  thumb_size=350, columns=5):
    """Detect faces on a page and display them as a contact sheet.

    Args:
        page: Name of the page, used only in the printed messages.
        img: Page image in BGR channel order (the colour conversions below
            assume this).
        scale_factor: ``scaleFactor`` passed to ``detectMultiScale``.
        min_neighbors: ``minNeighbors`` passed to ``detectMultiScale``.
        thumb_size: Edge length, in pixels, of each square face thumbnail.
        columns: Number of thumbnails per contact-sheet row.

    Raises:
        ValueError: If ``thumb_size`` or ``columns`` is smaller than 1.

    The defaults reproduce the previously hard-coded behaviour. Nothing is
    returned: the sheet is shown with ``display`` and a summary is printed.
    """
    if thumb_size < 1 or columns < 1:
        raise ValueError("thumb_size and columns must both be >= 1")

    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    faces_rect = haar_cascade.detectMultiScale(
        gray, scaleFactor=scale_factor, minNeighbors=min_neighbors)

    # Use len() rather than truthiness: the detector result may be a NumPy
    # array, whose truth value is ambiguous.
    face_count = len(faces_rect)
    if face_count == 0:
        print(f"But no faces found in {page}")
        return

    print(f'{face_count} faces found in {page}')

    # Ceiling division: a partially filled last row still needs a full row.
    rows = -(-face_count // columns)
    sheet = np.zeros((thumb_size * rows, thumb_size * columns, 3),
                     dtype=np.uint8)

    for index, (x, y, w, h) in enumerate(faces_rect):
        # Fill the grid left-to-right, top-to-bottom.
        row, col = divmod(index, columns)
        top, left = row * thumb_size, col * thumb_size
        face = img[y:y + h, x:x + w]
        sheet[top:top + thumb_size, left:left + thumb_size] = cv.resize(
            face, (thumb_size, thumb_size), interpolation=cv.INTER_CUBIC)

    # PIL expects RGB, while the sheet was assembled from BGR pixels.
    display(Image.fromarray(cv.cvtColor(sheet, cv.COLOR_BGR2RGB)))
# Ask for the search term. Only failures of input() itself are handled:
# EOFError (stdin closed) and RuntimeError, which also covers the
# NotImplementedError subclasses raised where stdin is unavailable.
# A bare `except:` would also swallow KeyboardInterrupt and SystemExit.
try:
    word = input("Enter word to search : ")
except (EOFError, RuntimeError):
    print("Input error, try again")
    word = None  # without a search term there is nothing to process

if word is not None:
    with zipfile.ZipFile('readonly/images.zip', 'r') as news_zip:
        news_zip.extractall('news_papers')
        for im in news_zip.namelist():
            img = cv.imread(f'news_papers/{im}')
            # cv.imread returns None for entries it cannot decode, such as
            # directories or non-image files; skip them instead of crashing
            # inside the grayscale conversion.
            if img is None:
                print(f'Skipping {im}: not a readable image')
                continue
            print(f'Processing {im}')
            find_word_in_image(im, img, word)